In [1]:
import sys
from time import perf_counter

import pandas as pd
import numpy as np

from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import KFold, cross_val_score, HalvingGridSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff

import keras_tuner
import tensorflow as tf
from keras.models import Model
from keras.layers import *
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
In [2]:
# plotly settings
px.defaults.width = 800
px.defaults.height = 450
In [3]:
# to have the comp metric
def rmse(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))
In [4]:
# helper for plotting
def plotly_scatter(df, f1, f2):
    fig = px.scatter(x=df[f1], y=df[f2]).update_layout(
    xaxis_title=f1, yaxis_title=f2, title=f'{f2} vs {f1}')
    fig.show()
In [5]:
# load the data
train = pd.read_parquet(f'./data/train.parquet')
test = pd.read_parquet(f'./data/test.parquet')
In [6]:
# inspect
train.head(20)
Out[6]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave None Reg Lvl AllPub ... 0 None None None 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave None Reg Lvl AllPub ... 0 None None None 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave None IR1 Lvl AllPub ... 0 None None None 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave None IR1 Lvl AllPub ... 0 None None None 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave None IR1 Lvl AllPub ... 0 None None None 0 12 2008 WD Normal 250000
5 6 50 RL 85.0 14115 Pave None IR1 Lvl AllPub ... 0 None MnPrv Shed 700 10 2009 WD Normal 143000
6 7 20 RL 75.0 10084 Pave None Reg Lvl AllPub ... 0 None None None 0 8 2007 WD Normal 307000
7 8 60 RL NaN 10382 Pave None IR1 Lvl AllPub ... 0 None None Shed 350 11 2009 WD Normal 200000
8 9 50 RM 51.0 6120 Pave None Reg Lvl AllPub ... 0 None None None 0 4 2008 WD Abnorml 129900
9 10 190 RL 50.0 7420 Pave None Reg Lvl AllPub ... 0 None None None 0 1 2008 WD Normal 118000
10 11 20 RL 70.0 11200 Pave None Reg Lvl AllPub ... 0 None None None 0 2 2008 WD Normal 129500
11 12 60 RL 85.0 11924 Pave None IR1 Lvl AllPub ... 0 None None None 0 7 2006 New Partial 345000
12 13 20 RL NaN 12968 Pave None IR2 Lvl AllPub ... 0 None None None 0 9 2008 WD Normal 144000
13 14 20 RL 91.0 10652 Pave None IR1 Lvl AllPub ... 0 None None None 0 8 2007 New Partial 279500
14 15 20 RL NaN 10920 Pave None IR1 Lvl AllPub ... 0 None GdWo None 0 5 2008 WD Normal 157000
15 16 45 RM 51.0 6120 Pave None Reg Lvl AllPub ... 0 None GdPrv None 0 7 2007 WD Normal 132000
16 17 20 RL NaN 11241 Pave None IR1 Lvl AllPub ... 0 None None Shed 700 3 2010 WD Normal 149000
17 18 90 RL 72.0 10791 Pave None Reg Lvl AllPub ... 0 None None Shed 500 10 2006 WD Normal 90000
18 19 20 RL 66.0 13695 Pave None Reg Lvl AllPub ... 0 None None None 0 6 2008 WD Normal 159000
19 20 20 RL 70.0 7560 Pave None Reg Lvl AllPub ... 0 None MnPrv None 0 5 2009 COD Abnorml 139000

20 rows × 81 columns

In [7]:
# check dtypes.. we can see a lot of non numeric features
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallCond    1460 non-null   int64  
 19  YearBuilt      1460 non-null   int64  
 20  YearRemodAdd   1460 non-null   int64  
 21  RoofStyle      1460 non-null   object 
 22  RoofMatl       1460 non-null   object 
 23  Exterior1st    1460 non-null   object 
 24  Exterior2nd    1460 non-null   object 
 25  MasVnrType     1452 non-null   object 
 26  MasVnrArea     1452 non-null   float64
 27  ExterQual      1460 non-null   object 
 28  ExterCond      1460 non-null   object 
 29  Foundation     1460 non-null   object 
 30  BsmtQual       1423 non-null   object 
 31  BsmtCond       1423 non-null   object 
 32  BsmtExposure   1422 non-null   object 
 33  BsmtFinType1   1423 non-null   object 
 34  BsmtFinSF1     1460 non-null   int64  
 35  BsmtFinType2   1422 non-null   object 
 36  BsmtFinSF2     1460 non-null   int64  
 37  BsmtUnfSF      1460 non-null   int64  
 38  TotalBsmtSF    1460 non-null   int64  
 39  Heating        1460 non-null   object 
 40  HeatingQC      1460 non-null   object 
 41  CentralAir     1460 non-null   object 
 42  Electrical     1459 non-null   object 
 43  1stFlrSF       1460 non-null   int64  
 44  2ndFlrSF       1460 non-null   int64  
 45  LowQualFinSF   1460 non-null   int64  
 46  GrLivArea      1460 non-null   int64  
 47  BsmtFullBath   1460 non-null   int64  
 48  BsmtHalfBath   1460 non-null   int64  
 49  FullBath       1460 non-null   int64  
 50  HalfBath       1460 non-null   int64  
 51  BedroomAbvGr   1460 non-null   int64  
 52  KitchenAbvGr   1460 non-null   int64  
 53  KitchenQual    1460 non-null   object 
 54  TotRmsAbvGrd   1460 non-null   int64  
 55  Functional     1460 non-null   object 
 56  Fireplaces     1460 non-null   int64  
 57  FireplaceQu    770 non-null    object 
 58  GarageType     1379 non-null   object 
 59  GarageYrBlt    1379 non-null   float64
 60  GarageFinish   1379 non-null   object 
 61  GarageCars     1460 non-null   int64  
 62  GarageArea     1460 non-null   int64  
 63  GarageQual     1379 non-null   object 
 64  GarageCond     1379 non-null   object 
 65  PavedDrive     1460 non-null   object 
 66  WoodDeckSF     1460 non-null   int64  
 67  OpenPorchSF    1460 non-null   int64  
 68  EnclosedPorch  1460 non-null   int64  
 69  3SsnPorch      1460 non-null   int64  
 70  ScreenPorch    1460 non-null   int64  
 71  PoolArea       1460 non-null   int64  
 72  PoolQC         7 non-null      object 
 73  Fence          281 non-null    object 
 74  MiscFeature    54 non-null     object 
 75  MiscVal        1460 non-null   int64  
 76  MoSold         1460 non-null   int64  
 77  YrSold         1460 non-null   int64  
 78  SaleType       1460 non-null   object 
 79  SaleCondition  1460 non-null   object 
 80  SalePrice      1460 non-null   int64  
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
In [8]:
# drop Id col but save test_IDs for submission
train.drop("Id", axis = 1, inplace = True)
test_IDs = test["Id"]
test.drop("Id", axis = 1, inplace = True)

EDA and Transforms¶

In [9]:
# check linear corr
corr = train.corr()
plt.subplots(figsize=(15,12))
sns.heatmap(corr, vmax=0.9, cmap="Blues", square=True)
C:\Users\gyenist\AppData\Local\Temp\ipykernel_6324\3622780105.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  corr = train.corr()
Out[9]:
<AxesSubplot: >
In [10]:
# check the highest corrs
corrs = train.corr()['SalePrice'].sort_values(ascending=False)[1:21]
px.bar(corrs).update_layout(
    xaxis_title='feature', yaxis_title='correlation', title='Correlations')
C:\Users\gyenist\AppData\Local\Temp\ipykernel_6324\2144235933.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  corrs = train.corr()['SalePrice'].sort_values(ascending=False)[1:21]
In [11]:
# check the outliers for the most highly correlated features
for c in corrs.index[:5]:
    plotly_scatter(train, c, 'SalePrice')
In [12]:
# filter outliers based on the plots
train.drop(train[(train['OverallQual']<5) & (train['SalePrice']>200000)].index, inplace=True)
train.drop(train[(train['GrLivArea']>4500) & (train['SalePrice']<300000)].index, inplace=True)
train.drop(train[(train['TotalBsmtSF']>4500)].index, inplace=True)
train.drop(train[(train['GarageArea']>1220)].index, inplace=True)
train.reset_index(drop=True, inplace=True)
In [13]:
# check target distribution
fig = ff.create_distplot([train["SalePrice"]], group_labels=['distplot'], bin_size=10000)
fig.update_layout(title_text='Curve and Rug Plot')
fig.show()

We can see that the SalePrice targets have a slight positive skew. We can correct this with a log transform.

In [14]:
# transform targets
train["SalePrice"] = np.log1p(train["SalePrice"])
In [15]:
fig = ff.create_distplot([train["SalePrice"]], group_labels=['distplot'], bin_size=.05)
fig.update_layout(title_text='Curve and Rug Plot')
fig.show()
In [16]:
# separate the targets
y_train = train['SalePrice'].reset_index(drop=True)
train_features = train.drop(['SalePrice'], axis=1)
test_features = test

# combine train and test features, before cleaning
all_features = pd.concat([train_features, test_features]).reset_index(drop=True)
all_features.shape
Out[16]:
(2913, 79)
In [17]:
# convert non cardinal numerical features into strings
all_features['MSSubClass'] = all_features['MSSubClass'].apply(str)
all_features['YrSold'] = all_features['YrSold'].astype(str)
all_features['MoSold'] = all_features['MoSold'].astype(str)
In [18]:
# the data description states that NA refers to typical ('Typ') values
all_features['Functional'] = all_features['Functional'].fillna('Typ')
all_features['Electrical'] = all_features['Electrical'].fillna("SBrkr")
all_features['KitchenQual'] = all_features['KitchenQual'].fillna("TA")

# the data description stats that NA refers to "no pool
all_features["PoolQC"] = all_features["PoolQC"].fillna("None")

# description says nan means no alley access
all_features["Alley"] = all_features["Alley"].fillna("None")

# replace with 0
for col in ('GarageArea', 'GarageCars'):
    all_features[col] = all_features[col].fillna(0)

# nans for these categorical basement features, means there's no basement
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_features[col] = all_features[col].fillna('None')

# fill with the median of the neighborhood
all_features['LotFrontage'] = all_features.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
In [19]:
# fill the rest, which we can't interpret with None and 0
objects = []
for i in all_features.columns:
    if all_features[i].dtype == object:
        objects.append(i)
all_features.update(all_features[objects].fillna('None'))

numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric = []
for i in all_features.columns:
    if all_features[i].dtype in numeric_dtypes:
        numeric.append(i)
all_features.update(all_features[numeric].fillna(0))
In [20]:
all_features.describe()
Out[20]:
LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF ... GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal
count 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 ... 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000
mean 69.403021 10113.405424 6.088225 5.566083 1971.297288 1984.251287 100.913835 438.686577 49.667353 560.149331 ... 1870.115345 1.763817 471.343289 93.530381 47.260213 23.145898 2.607621 16.095434 2.091658 50.738414
std 21.193771 7758.911341 1.404996 1.113345 30.290390 20.890446 178.092327 443.958903 169.338330 438.974982 ... 450.114935 0.760798 213.120051 126.410731 67.132175 64.301832 25.213828 56.237482 34.585013 567.904167
min 21.000000 1300.000000 1.000000 1.000000 1872.000000 1950.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 60.000000 7472.000000 5.000000 5.000000 1953.000000 1965.000000 0.000000 0.000000 0.000000 220.000000 ... 1957.000000 1.000000 320.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 70.000000 9450.000000 6.000000 5.000000 1973.000000 1993.000000 0.000000 368.000000 0.000000 467.000000 ... 1977.000000 2.000000 479.000000 0.000000 26.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 80.000000 11526.000000 7.000000 6.000000 2001.000000 2004.000000 163.000000 732.000000 0.000000 803.000000 ... 2001.000000 2.000000 576.000000 168.000000 70.000000 0.000000 0.000000 0.000000 0.000000 0.000000
max 313.000000 215245.000000 10.000000 9.000000 2010.000000 2010.000000 1600.000000 4010.000000 1526.000000 2336.000000 ... 2207.000000 5.000000 1488.000000 1424.000000 742.000000 1012.000000 508.000000 576.000000 800.000000 17000.000000

8 rows × 33 columns

In [21]:
# drop features with very low standard deviation -> they contain very little information
all_features = all_features.drop(['Utilities', 'Street', 'PoolQC'], axis=1)
In [22]:
# add some extra features
all_features['BsmtFinType1_Unf'] = 1*(all_features['BsmtFinType1'] == 'Unf')
all_features['HasWoodDeck'] = (all_features['WoodDeckSF'] == 0) * 1
all_features['HasOpenPorch'] = (all_features['OpenPorchSF'] == 0) * 1
all_features['HasEnclosedPorch'] = (all_features['EnclosedPorch'] == 0) * 1
all_features['Has3SsnPorch'] = (all_features['3SsnPorch'] == 0) * 1
all_features['HasScreenPorch'] = (all_features['ScreenPorch'] == 0) * 1
all_features['YearsSinceRemodel'] = all_features['YrSold'].astype(int) - all_features['YearRemodAdd'].astype(int)
all_features['Total_Home_Quality'] = all_features['OverallQual'] + all_features['OverallCond']

all_features['TotalSF'] = all_features['TotalBsmtSF'] + all_features['1stFlrSF'] + all_features['2ndFlrSF']
all_features['YrBltAndRemod'] = all_features['YearBuilt'] + all_features['YearRemodAdd']

all_features['Total_sqr_footage'] = (all_features['BsmtFinSF1'] + all_features['BsmtFinSF2'] +
                                 all_features['1stFlrSF'] + all_features['2ndFlrSF'])
all_features['Total_Bathrooms'] = (all_features['FullBath'] + (0.5 * all_features['HalfBath']) +
                               all_features['BsmtFullBath'] + (0.5 * all_features['BsmtHalfBath']))
all_features['Total_porch_sf'] = (all_features['OpenPorchSF'] + all_features['3SsnPorch'] +
                              all_features['EnclosedPorch'] + all_features['ScreenPorch'] +
                              all_features['WoodDeckSF'])

all_features['haspool'] = all_features['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
all_features['has2ndfloor'] = all_features['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
all_features['hasgarage'] = all_features['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
all_features['hasbsmt'] = all_features['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
all_features['hasfireplace'] = all_features['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
In [23]:
all_features.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2913 entries, 0 to 2912
Data columns (total 94 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   MSSubClass          2913 non-null   object 
 1   MSZoning            2913 non-null   object 
 2   LotFrontage         2913 non-null   float64
 3   LotArea             2913 non-null   int64  
 4   Alley               2913 non-null   object 
 5   LotShape            2913 non-null   object 
 6   LandContour         2913 non-null   object 
 7   LotConfig           2913 non-null   object 
 8   LandSlope           2913 non-null   object 
 9   Neighborhood        2913 non-null   object 
 10  Condition1          2913 non-null   object 
 11  Condition2          2913 non-null   object 
 12  BldgType            2913 non-null   object 
 13  HouseStyle          2913 non-null   object 
 14  OverallQual         2913 non-null   int64  
 15  OverallCond         2913 non-null   int64  
 16  YearBuilt           2913 non-null   int64  
 17  YearRemodAdd        2913 non-null   int64  
 18  RoofStyle           2913 non-null   object 
 19  RoofMatl            2913 non-null   object 
 20  Exterior1st         2913 non-null   object 
 21  Exterior2nd         2913 non-null   object 
 22  MasVnrType          2913 non-null   object 
 23  MasVnrArea          2913 non-null   float64
 24  ExterQual           2913 non-null   object 
 25  ExterCond           2913 non-null   object 
 26  Foundation          2913 non-null   object 
 27  BsmtQual            2913 non-null   object 
 28  BsmtCond            2913 non-null   object 
 29  BsmtExposure        2913 non-null   object 
 30  BsmtFinType1        2913 non-null   object 
 31  BsmtFinSF1          2913 non-null   float64
 32  BsmtFinType2        2913 non-null   object 
 33  BsmtFinSF2          2913 non-null   float64
 34  BsmtUnfSF           2913 non-null   float64
 35  TotalBsmtSF         2913 non-null   float64
 36  Heating             2913 non-null   object 
 37  HeatingQC           2913 non-null   object 
 38  CentralAir          2913 non-null   object 
 39  Electrical          2913 non-null   object 
 40  1stFlrSF            2913 non-null   int64  
 41  2ndFlrSF            2913 non-null   int64  
 42  LowQualFinSF        2913 non-null   int64  
 43  GrLivArea           2913 non-null   int64  
 44  BsmtFullBath        2913 non-null   float64
 45  BsmtHalfBath        2913 non-null   float64
 46  FullBath            2913 non-null   int64  
 47  HalfBath            2913 non-null   int64  
 48  BedroomAbvGr        2913 non-null   int64  
 49  KitchenAbvGr        2913 non-null   int64  
 50  KitchenQual         2913 non-null   object 
 51  TotRmsAbvGrd        2913 non-null   int64  
 52  Functional          2913 non-null   object 
 53  Fireplaces          2913 non-null   int64  
 54  FireplaceQu         2913 non-null   object 
 55  GarageType          2913 non-null   object 
 56  GarageYrBlt         2913 non-null   float64
 57  GarageFinish        2913 non-null   object 
 58  GarageCars          2913 non-null   float64
 59  GarageArea          2913 non-null   float64
 60  GarageQual          2913 non-null   object 
 61  GarageCond          2913 non-null   object 
 62  PavedDrive          2913 non-null   object 
 63  WoodDeckSF          2913 non-null   int64  
 64  OpenPorchSF         2913 non-null   int64  
 65  EnclosedPorch       2913 non-null   int64  
 66  3SsnPorch           2913 non-null   int64  
 67  ScreenPorch         2913 non-null   int64  
 68  PoolArea            2913 non-null   int64  
 69  Fence               2913 non-null   object 
 70  MiscFeature         2913 non-null   object 
 71  MiscVal             2913 non-null   int64  
 72  MoSold              2913 non-null   object 
 73  YrSold              2913 non-null   object 
 74  SaleType            2913 non-null   object 
 75  SaleCondition       2913 non-null   object 
 76  BsmtFinType1_Unf    2913 non-null   int32  
 77  HasWoodDeck         2913 non-null   int32  
 78  HasOpenPorch        2913 non-null   int32  
 79  HasEnclosedPorch    2913 non-null   int32  
 80  Has3SsnPorch        2913 non-null   int32  
 81  HasScreenPorch      2913 non-null   int32  
 82  YearsSinceRemodel   2913 non-null   int32  
 83  Total_Home_Quality  2913 non-null   int64  
 84  TotalSF             2913 non-null   float64
 85  YrBltAndRemod       2913 non-null   int64  
 86  Total_sqr_footage   2913 non-null   float64
 87  Total_Bathrooms     2913 non-null   float64
 88  Total_porch_sf      2913 non-null   int64  
 89  haspool             2913 non-null   int64  
 90  has2ndfloor         2913 non-null   int64  
 91  hasgarage           2913 non-null   int64  
 92  hasbsmt             2913 non-null   int64  
 93  hasfireplace        2913 non-null   int64  
dtypes: float64(14), int32(7), int64(30), object(43)
memory usage: 2.0+ MB
In [24]:
# descriptors for ordinal features: we can reduce the feature count a bit if we don't one-hot encode every categorical feature
desc = {
    'Alley':{
        'None':0,
        'Grvl':1,
        'Pave':2
    },
    'Utilities':{
        'ELO': 0,
        'NoSeWa': 1,
        'NoSewr': 2,
        'AllPub': 3

    },
    'ExterQual':{
        'Po': 0,
        'Fa': 1,
        'TA': 2,
        'Gd':3,
        'Ex':4
    },
    'ExterCond': {
        'Po': 0,
        'Fa': 1,
        'TA': 2,
        'Gd':3,
        'Ex':4
    },
    'KitchenQual': {
        'Po': 0,
        'Fa': 1,
        'TA': 2,
        'Gd':3,
        'Ex':4
    }
        }
In [25]:
# convert the above-mentioned features
for col in all_features.columns:
    if col in desc.keys():
        all_features[col] = all_features[col].map(desc[col])
In [26]:
# onehot encode the rest
all_features = pd.get_dummies(all_features).reset_index(drop=True)
In [27]:
all_features.describe()
Out[27]:
LotFrontage LotArea Alley OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea ExterQual ExterCond ... SaleType_New SaleType_None SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial
count 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 ... 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000 2913.000000
mean 69.403021 10113.405424 0.094748 6.088225 5.566083 1971.297288 1984.251287 100.913835 2.395812 2.085479 ... 0.081016 0.000343 0.002403 0.866117 0.065225 0.004119 0.008239 0.015791 0.823550 0.083076
std 21.193771 7758.911341 0.373325 1.404996 1.113345 30.290390 20.890446 178.092327 0.577934 0.372342 ... 0.272907 0.018528 0.048970 0.340585 0.246965 0.064062 0.090409 0.124689 0.381268 0.276044
min 21.000000 1300.000000 0.000000 1.000000 1.000000 1872.000000 1950.000000 0.000000 1.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 60.000000 7472.000000 0.000000 5.000000 5.000000 1953.000000 1965.000000 0.000000 2.000000 2.000000 ... 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000
50% 70.000000 9450.000000 0.000000 6.000000 5.000000 1973.000000 1993.000000 0.000000 2.000000 2.000000 ... 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000
75% 80.000000 11526.000000 0.000000 7.000000 6.000000 2001.000000 2004.000000 163.000000 3.000000 2.000000 ... 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000
max 313.000000 215245.000000 2.000000 10.000000 9.000000 2010.000000 2010.000000 1600.000000 4.000000 4.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 333 columns

In [28]:
all_features
Out[28]:
LotFrontage LotArea Alley OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea ExterQual ExterCond ... SaleType_New SaleType_None SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial
0 65.0 8450 0 7 5 2003 2003 196.0 3 2 ... 0 0 0 1 0 0 0 0 1 0
1 80.0 9600 0 6 8 1976 1976 0.0 2 2 ... 0 0 0 1 0 0 0 0 1 0
2 68.0 11250 0 7 5 2001 2002 162.0 3 2 ... 0 0 0 1 0 0 0 0 1 0
3 60.0 9550 0 7 5 1915 1970 0.0 2 2 ... 0 0 0 1 1 0 0 0 0 0
4 84.0 14260 0 8 5 2000 2000 350.0 3 2 ... 0 0 0 1 0 0 0 0 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2908 21.0 1936 0 4 7 1970 1970 0.0 2 2 ... 0 0 0 1 0 0 0 0 1 0
2909 21.0 1894 0 4 5 1970 1970 0.0 2 2 ... 0 0 0 1 1 0 0 0 0 0
2910 160.0 20000 0 5 7 1960 1996 0.0 2 2 ... 0 0 0 1 1 0 0 0 0 0
2911 62.0 10441 0 5 5 1992 1992 0.0 2 2 ... 0 0 0 1 0 0 0 0 1 0
2912 74.0 9627 0 7 5 1993 1994 94.0 2 2 ... 0 0 0 1 0 0 0 0 1 0

2913 rows × 333 columns

In [29]:
X_train = all_features.iloc[:len(y_train), :]
X_test = all_features.iloc[len(y_train):, :]
X_train.shape, y_train.shape, X_test.shape
Out[29]:
((1454, 333), (1454,), (1459, 333))

Decision Tree¶

We train a single regression tree and we use grid search with 10 fold cross validation for hyperparameter tuning. The grid search is commented out so the notebook can be ran without waiting for it to complete.

In [30]:
# # set the parameter grid
# parameters={
#             "max_features": list(np.arange(100, 320, 2)) + [len(all_features.columns)],
#             "max_leaf_nodes": list(np.arange(50, 100, 2))
#             }
# # get an estimator object
# estimator = DecisionTreeRegressor()
#
# # run optimizer
# t_start = perf_counter()
# sh_dfl = GridSearchCV(estimator, parameters, cv=10, verbose=1).fit(X_train, y_train)
# t_stop = perf_counter()
#
# # reuslts
# print(f'Took {(t_stop-t_start)/60:.2f} mins to optimize\nBest params:')
# [print(f'{k}: {v}') for k, v in sh_dfl.best_params_.items()]
In [31]:
# Example output. I ran this a few times with slightly different settings.
# Fitting 10 folds for each of 2775 candidates, totalling 27750 fits
# Took 13.74 mins to optimize
# Best params:
# max_features: 236
# max_leaf_nodes: 54
In [32]:
# dt = DecisionTreeRegressor(**sh_dfl.best_params_)
In [33]:
dt = DecisionTreeRegressor(max_features=236, max_leaf_nodes=54)
In [34]:
# Setup cross validation folds
kf = KFold(n_splits=25, random_state=999, shuffle=True)
MSE_scorer = make_scorer(mean_squared_error)
In [35]:
scores = np.sqrt(cross_val_score(dt, X_train, y_train, scoring=MSE_scorer, cv=kf))
print(f'single tree scored {np.mean(scores):.6f} with std of {np.std(scores):.6f}')
single tree scored 0.180911 with std of 0.029965
In [36]:
# train on the full dataset
dt_full = dt.fit(X_train, y_train)
In [37]:
# check some tree properties
dt_full.get_depth(), dt_full.get_n_leaves()
Out[37]:
(8, 54)
In [38]:
# score
score = np.sqrt(MSE_scorer(dt_full, X_train, y_train))
print(f'MSE score on train data: {score}')
MSE score on train data: 0.12404532522357475
In [39]:
# predict on test set and transform back the preds
preds_test = np.expm1(dt_full.predict(X_test))
In [40]:
preds_final = pd.DataFrame.from_dict({"Id": test_IDs, "SalePrice": preds_test})
preds_final
Out[40]:
Id SalePrice
0 1461 124581.694917
1 1462 137042.410372
2 1463 149121.875345
3 1464 197911.415304
4 1465 199884.603043
... ... ...
1454 2915 81539.297943
1455 2916 103087.648835
1456 2917 149121.875345
1457 2918 99079.167550
1458 2919 214936.395618

1459 rows × 2 columns

In [41]:
preds_final.to_csv("./data/preds_with_hpo.csv", index=None)

Freestyle¶

For the freestyle mode we use two classical ensemble models and a dense neural network. Hyperparameter search for these can be very computatinally expensive, so it is only used to find a base neural network architecture. We just use the mean of the models' predictions for the final ensemble submission.

Classical Ensemble¶

In [42]:
# create and train a GB regressor just with default parameters and check if we get a good score
gb = GradientBoostingRegressor()
kf = KFold(n_splits=25, random_state=999, shuffle=True)
MSE_scorer = make_scorer(mean_squared_error)
scores = np.sqrt(cross_val_score(gb, X_train, y_train, scoring=MSE_scorer, cv=kf))
print(f'gb scored {np.mean(scores):.6f} with std of {np.std(scores):.6f}')
gb scored 0.118587 with std of 0.020774
In [43]:
# fit on the full data
gb_full = gb.fit(X_train, y_train)
preds_test_gb = np.expm1(gb_full.predict(X_test))
preds_final_gb = pd.DataFrame.from_dict({"Id": test_IDs, "SalePrice": preds_test_gb})
In [44]:
# same for the random forest
rf = RandomForestRegressor()
kf = KFold(n_splits=25, random_state=999, shuffle=True)
MSE_scorer = make_scorer(mean_squared_error)
scores = np.sqrt(cross_val_score(rf, X_train, y_train, scoring=MSE_scorer, cv=kf))
print(f'rf scored {np.mean(scores):.6f} with std of {np.std(scores):.6f}')
rf scored 0.130527 with std of 0.023893
In [45]:
rf_full = rf.fit(X_train, y_train)
preds_test_rf = np.expm1(rf_full.predict(X_test))
preds_final_rf = pd.DataFrame.from_dict({"Id": test_IDs, "SalePrice": preds_test_gb})

Deep Learning Model¶

In [46]:
# just copy the data from the previous models and add back the targets for norm
df_norm = X_train.copy()
df_norm['HousePrice'] = y_train

# shuffle split and norm
split = .8
split_id = round(split*len(df_norm))

df_norm = df_norm.sample(frac=1, random_state=333).reset_index(drop=True)

scaler = StandardScaler()
df_norm = scaler.fit_transform(df_norm)

df_train = df_norm[:split_id, :]
df_test = df_norm[split_id:, :]

# separate targets
x_train = df_train[:, :-1].astype('float32')
y_train = df_train[:, -1].astype('float32')

x_eval = df_test[:, :-1].astype('float32')
y_eval = df_test[:, -1].astype('float32')

Searching for a good base architecture using keras_tuner.Hyperband -> this can be much faster than Bayesian optimization or grid search. The code is commented out so the notebook can be ran without waiting for the search.

In [47]:
def model_builder(hp):
    max_depth = 3
    widths = []
    drops = []
    depth = hp.Int(f'depth', min_value=1, max_value=max_depth, step=1)
    act = hp.Choice('act', values=['swish', 'relu'])

    for i in range(max_depth):
        widths.append(hp.Int(f'dense_width_{i}', min_value=16, max_value=2048, step=16))
        drops.append(hp.Float(f'dropout_val_{i}', min_value=.1, max_value=.7))

    inp = Input(shape=(df_train.shape[-1]-1))
    for i in range(depth):
        if i == 0:
            x = Dense(units=round(widths[i]), activation=act)(inp)
        else:
            x = Dense(units=round(widths[i]), activation=act)(x)
        x = Dropout(drops[i])(x)
    x = Dense(1, activation='linear')(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(optimizer=Adam(), loss=tf.keras.losses.mean_squared_error)

    return model
In [48]:
# tuner = keras_tuner.Hyperband(
#     model_builder,
#     max_epochs=100,
#     factor=3,
#     objective="val_loss",
#     directory='./data/DL',
#     project_name=f'HS_LOGS',
#     overwrite=True
# )
# tuner.search_space_summary()
# tuner.search(x_train, y_train, epochs=100, validation_split=.2, batch_size=128, verbose=1,
#              shuffle=True, callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)])
#
# models = tuner.get_best_models(num_models=5)
#
# best_model = models[0]
#
# print(tuner.results_summary())
# print(best_model.summary())
#
# original_stdout = sys.stdout
# with open(f'./data/DL/LOG.log', 'w') as f:
#     sys.stdout = f
#     print(f'\n***TUNER SUMMARY***\n')
#     print(tuner.results_summary())
#
#     print(f'\n\n***MODELS SUMMARY***\n')
#     for rank, model in enumerate(models):
#         print(f'\nMODEL RANK {rank} - STRUCTURE:')
#         print(model.summary())
#     sys.stdout = original_stdout
In [49]:
# best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
In [50]:
# model_1 = tuner.hypermodel.build(best_hps)
# print(model_1.summary())

The parameters can be seen in the LOG.log file. We can now build a similar model.

In [51]:
# for reusability wrap into a func
def build():
    inp = Input(shape=(df_train.shape[-1]-1))
    x = Dense(1000, activation='relu')(inp)
    x = Dropout(.5)(x)
    x = Dense(100, activation='relu')(x)
    x = Dropout(.2)(x)
    out = Dense(1, activation='linear')(x)

    model = Model(inputs=inp, outputs=out)
    model.compile(optimizer='adam', loss=tf.keras.losses.MSE)

    print(model.summary())

    return model
In [52]:
# define a checkpont callback
callbacks = [
    ModelCheckpoint(f'./data/DL/model_weights_best.h5',
                             monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')
]
# get the model
model = build()

# fit
hist = model.fit(x=x_train,
                 y=y_train,
                 validation_data=(x_eval, y_eval),
                 epochs=150,
                 batch_size=32,
                 callbacks=callbacks,
                 verbose=0)
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_1 (InputLayer)        [(None, 333)]             0         
                                                                 
 dense (Dense)               (None, 1000)              334000    
                                                                 
 dropout (Dropout)           (None, 1000)              0         
                                                                 
 dense_1 (Dense)             (None, 100)               100100    
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 101       
                                                                 
=================================================================
Total params: 434,201
Trainable params: 434,201
Non-trainable params: 0
_________________________________________________________________
None

Epoch 1: val_loss improved from inf to 0.41158, saving model to ./data/DL\model_weights_best.h5

Epoch 2: val_loss improved from 0.41158 to 0.28557, saving model to ./data/DL\model_weights_best.h5

Epoch 3: val_loss improved from 0.28557 to 0.25773, saving model to ./data/DL\model_weights_best.h5

Epoch 4: val_loss improved from 0.25773 to 0.19066, saving model to ./data/DL\model_weights_best.h5

Epoch 5: val_loss did not improve from 0.19066

Epoch 6: val_loss did not improve from 0.19066

Epoch 7: val_loss did not improve from 0.19066

Epoch 8: val_loss did not improve from 0.19066

Epoch 9: val_loss improved from 0.19066 to 0.14923, saving model to ./data/DL\model_weights_best.h5

Epoch 10: val_loss did not improve from 0.14923

Epoch 11: val_loss improved from 0.14923 to 0.13262, saving model to ./data/DL\model_weights_best.h5

Epoch 12: val_loss did not improve from 0.13262

Epoch 13: val_loss did not improve from 0.13262

Epoch 14: val_loss improved from 0.13262 to 0.12666, saving model to ./data/DL\model_weights_best.h5

Epoch 15: val_loss did not improve from 0.12666

Epoch 16: val_loss did not improve from 0.12666

Epoch 17: val_loss did not improve from 0.12666

Epoch 18: val_loss did not improve from 0.12666

Epoch 19: val_loss did not improve from 0.12666

Epoch 20: val_loss improved from 0.12666 to 0.11815, saving model to ./data/DL\model_weights_best.h5

Epoch 21: val_loss did not improve from 0.11815

Epoch 22: val_loss did not improve from 0.11815

Epoch 23: val_loss did not improve from 0.11815

Epoch 24: val_loss did not improve from 0.11815

Epoch 25: val_loss did not improve from 0.11815

Epoch 26: val_loss did not improve from 0.11815

Epoch 27: val_loss did not improve from 0.11815

Epoch 28: val_loss did not improve from 0.11815

Epoch 29: val_loss did not improve from 0.11815

Epoch 30: val_loss did not improve from 0.11815

Epoch 31: val_loss improved from 0.11815 to 0.11511, saving model to ./data/DL\model_weights_best.h5

Epoch 32: val_loss did not improve from 0.11511

Epoch 33: val_loss did not improve from 0.11511

Epoch 34: val_loss did not improve from 0.11511

Epoch 35: val_loss did not improve from 0.11511

Epoch 36: val_loss did not improve from 0.11511

Epoch 37: val_loss did not improve from 0.11511

Epoch 38: val_loss improved from 0.11511 to 0.11296, saving model to ./data/DL\model_weights_best.h5

Epoch 39: val_loss did not improve from 0.11296

Epoch 40: val_loss did not improve from 0.11296

Epoch 41: val_loss did not improve from 0.11296

Epoch 42: val_loss did not improve from 0.11296

Epoch 43: val_loss did not improve from 0.11296

Epoch 44: val_loss did not improve from 0.11296

Epoch 45: val_loss did not improve from 0.11296

Epoch 46: val_loss did not improve from 0.11296

Epoch 47: val_loss did not improve from 0.11296

Epoch 48: val_loss did not improve from 0.11296

Epoch 49: val_loss did not improve from 0.11296

Epoch 50: val_loss did not improve from 0.11296

Epoch 51: val_loss did not improve from 0.11296

Epoch 52: val_loss did not improve from 0.11296

Epoch 53: val_loss did not improve from 0.11296

Epoch 54: val_loss did not improve from 0.11296

Epoch 55: val_loss did not improve from 0.11296

Epoch 56: val_loss did not improve from 0.11296

Epoch 57: val_loss did not improve from 0.11296

Epoch 58: val_loss did not improve from 0.11296

Epoch 59: val_loss did not improve from 0.11296

Epoch 60: val_loss did not improve from 0.11296

Epoch 61: val_loss did not improve from 0.11296

Epoch 62: val_loss did not improve from 0.11296

Epoch 63: val_loss did not improve from 0.11296

Epoch 64: val_loss did not improve from 0.11296

Epoch 65: val_loss did not improve from 0.11296

Epoch 66: val_loss did not improve from 0.11296

Epoch 67: val_loss did not improve from 0.11296

Epoch 68: val_loss did not improve from 0.11296

Epoch 69: val_loss did not improve from 0.11296

Epoch 70: val_loss did not improve from 0.11296

Epoch 71: val_loss did not improve from 0.11296

Epoch 72: val_loss did not improve from 0.11296

Epoch 73: val_loss did not improve from 0.11296

Epoch 74: val_loss did not improve from 0.11296

Epoch 75: val_loss did not improve from 0.11296

Epoch 76: val_loss did not improve from 0.11296

Epoch 77: val_loss did not improve from 0.11296

Epoch 78: val_loss did not improve from 0.11296

Epoch 79: val_loss did not improve from 0.11296

Epoch 80: val_loss did not improve from 0.11296

Epoch 81: val_loss did not improve from 0.11296

Epoch 82: val_loss did not improve from 0.11296

Epoch 83: val_loss did not improve from 0.11296

Epoch 84: val_loss did not improve from 0.11296

Epoch 85: val_loss did not improve from 0.11296

Epoch 86: val_loss did not improve from 0.11296

Epoch 87: val_loss did not improve from 0.11296

Epoch 88: val_loss did not improve from 0.11296

Epoch 89: val_loss did not improve from 0.11296

Epoch 90: val_loss did not improve from 0.11296

Epoch 91: val_loss did not improve from 0.11296

Epoch 92: val_loss did not improve from 0.11296

Epoch 93: val_loss did not improve from 0.11296

Epoch 94: val_loss did not improve from 0.11296

Epoch 95: val_loss did not improve from 0.11296

Epoch 96: val_loss did not improve from 0.11296

Epoch 97: val_loss did not improve from 0.11296

Epoch 98: val_loss did not improve from 0.11296

Epoch 99: val_loss did not improve from 0.11296

Epoch 100: val_loss did not improve from 0.11296

Epoch 101: val_loss did not improve from 0.11296

Epoch 102: val_loss did not improve from 0.11296

Epoch 103: val_loss did not improve from 0.11296

Epoch 104: val_loss did not improve from 0.11296

Epoch 105: val_loss did not improve from 0.11296

Epoch 106: val_loss did not improve from 0.11296

Epoch 107: val_loss did not improve from 0.11296

Epoch 108: val_loss did not improve from 0.11296

Epoch 109: val_loss did not improve from 0.11296

Epoch 110: val_loss did not improve from 0.11296

Epoch 111: val_loss did not improve from 0.11296

Epoch 112: val_loss did not improve from 0.11296

Epoch 113: val_loss did not improve from 0.11296

Epoch 114: val_loss did not improve from 0.11296

Epoch 115: val_loss did not improve from 0.11296

Epoch 116: val_loss did not improve from 0.11296

Epoch 117: val_loss did not improve from 0.11296

Epoch 118: val_loss did not improve from 0.11296

Epoch 119: val_loss did not improve from 0.11296

Epoch 120: val_loss did not improve from 0.11296

Epoch 121: val_loss did not improve from 0.11296

Epoch 122: val_loss did not improve from 0.11296

Epoch 123: val_loss did not improve from 0.11296

Epoch 124: val_loss did not improve from 0.11296

Epoch 125: val_loss did not improve from 0.11296

Epoch 126: val_loss did not improve from 0.11296

Epoch 127: val_loss did not improve from 0.11296

Epoch 128: val_loss did not improve from 0.11296

Epoch 129: val_loss did not improve from 0.11296

Epoch 130: val_loss did not improve from 0.11296

Epoch 131: val_loss did not improve from 0.11296

Epoch 132: val_loss did not improve from 0.11296

Epoch 133: val_loss did not improve from 0.11296

Epoch 134: val_loss did not improve from 0.11296

Epoch 135: val_loss did not improve from 0.11296

Epoch 136: val_loss did not improve from 0.11296

Epoch 137: val_loss did not improve from 0.11296

Epoch 138: val_loss did not improve from 0.11296

Epoch 139: val_loss did not improve from 0.11296

Epoch 140: val_loss did not improve from 0.11296

Epoch 141: val_loss did not improve from 0.11296

Epoch 142: val_loss did not improve from 0.11296

Epoch 143: val_loss did not improve from 0.11296

Epoch 144: val_loss did not improve from 0.11296

Epoch 145: val_loss did not improve from 0.11296

Epoch 146: val_loss did not improve from 0.11296

Epoch 147: val_loss did not improve from 0.11296

Epoch 148: val_loss did not improve from 0.11296

Epoch 149: val_loss did not improve from 0.11296

Epoch 150: val_loss did not improve from 0.11296
In [53]:
plt.figure(10, (10, 5))
plt.plot(hist.history['val_loss'])
plt.plot(hist.history['loss'])
plt.legend(['val_loss', 'loss'])
Out[53]:
<matplotlib.legend.Legend at 0x242039e5b40>

We can see overfitting starts at around 50 epochs. This is important for when we want to train the model on the whole dataset. Now let's check performance.

In [54]:
# load back
model.load_weights('./data/DL/model_weights_best.h5')

# predict
y_preds = model.predict(x_eval, batch_size=10)
30/30 [==============================] - 0s 2ms/step
In [55]:
# some trickery to get back to the original scale
preds_dummy = np.zeros((x_eval.shape[0], x_eval.shape[1] + 1))
preds_dummy[:, -1] = np.squeeze(y_preds)

y_eval_dummy = np.zeros((x_eval.shape[0], x_eval.shape[1] + 1))
y_eval_dummy[:, -1] = np.squeeze(y_eval)

preds_rescaled = scaler.inverse_transform(preds_dummy)[:, -1]
y_eval_rescaled = scaler.inverse_transform(y_eval_dummy)[:, -1]
In [56]:
# evaluate
rmse(y_eval_rescaled, preds_rescaled)
Out[56]:
0.13422389567364781
In [57]:
# final training on all training data
dl = build()
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_2 (InputLayer)        [(None, 333)]             0         
                                                                 
 dense_3 (Dense)             (None, 1000)              334000    
                                                                 
 dropout_2 (Dropout)         (None, 1000)              0         
                                                                 
 dense_4 (Dense)             (None, 100)               100100    
                                                                 
 dropout_3 (Dropout)         (None, 100)               0         
                                                                 
 dense_5 (Dense)             (None, 1)                 101       
                                                                 
=================================================================
Total params: 434,201
Trainable params: 434,201
Non-trainable params: 0
_________________________________________________________________
None
In [58]:
dl.fit(x=np.vstack((x_train, x_eval)),
                 y=np.vstack((y_train.reshape(-1,1), y_eval.reshape(-1,1))),
                 epochs=50,
                 batch_size=32,
                 verbose=0)
Out[58]:
<keras.callbacks.History at 0x2438c2d2da0>
In [59]:
# normalize test data
x_test = scaler.transform(np.hstack((X_test, np.zeros((X_test.shape[0],1)))))[:,:-1]
C:\Dev\Anaconda\envs\elte_ai\lib\site-packages\sklearn\base.py:450: UserWarning:

X does not have valid feature names, but StandardScaler was fitted with feature names

In [60]:
# predict on test data
y_preds_test = dl.predict(x_test, batch_size=10)
146/146 [==============================] - 0s 1ms/step
In [61]:
# scale back
preds_dummy_test = np.zeros((x_test.shape[0], x_test.shape[1] + 1))
preds_dummy_test[:, -1] = np.squeeze(y_preds_test)
preds_rescaled_test = np.expm1(scaler.inverse_transform(preds_dummy_test)[:, -1])
preds_test_dl = preds_rescaled_test
In [62]:
preds_final_dl = pd.DataFrame.from_dict({"Id": test_IDs, "SalePrice": preds_rescaled_test})
preds_final_dl
Out[62]:
Id SalePrice
0 1461 131534.669693
1 1462 165172.578058
2 1463 179471.864419
3 1464 196345.965907
4 1465 179432.165070
... ... ...
1454 2915 94894.202533
1455 2916 88548.394947
1456 2917 168072.988735
1457 2918 118933.102863
1458 2919 210714.324294

1459 rows × 2 columns

In [63]:
# save dl only freestyle preds
preds_final_dl.to_csv("./data/freestyle_preds.csv", index=None)
In [64]:
# create ensemble preds
preds_test_ensemble = np.mean(np.hstack((preds_test_gb.reshape(-1,1), preds_test_rf.reshape(-1,1), preds_test_dl.reshape(-1,1))), axis=1)
In [65]:
preds_final_ensemble = pd.DataFrame.from_dict({"Id": test_IDs, "SalePrice": preds_test_ensemble})
preds_final_ensemble
Out[65]:
Id SalePrice
0 1461 131212.174619
1 1462 165875.090142
2 1463 176887.468719
3 1464 192306.382069
4 1465 189384.589165
... ... ...
1454 2915 89079.854580
1455 2916 86090.332983
1456 2917 169209.946253
1457 2918 112761.835770
1458 2919 219134.774701

1459 rows × 2 columns

In [66]:
# save dl only freestyle preds -> this got 0.13578 in Kaggle
preds_final_ensemble.to_csv("./data/freestyle_ensemble_preds.csv", index=None)
In [ ]: